Data Processing

Reading the data from CSV

library(readr)
library(tidyverse)
sheet1 = read_csv("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv")

Selecting Area_name,STCOU, column ending with “D” column and renaming Area_name

sheet2=sheet1%>%select(Area_name,STCOU,ends_with("D"))%>%rename("area_name"=Area_name)
sheet2

Converting data into Long Format based on enrollment value

sheet3=sheet2%>%pivot_longer(cols =3:12,names_to = "enrollment_value",values_to = "number")
sheet3

Splitting the enrollment string into measurement and year.

sheet3$Measurements=substr(sheet3$enrollment_value,1,7)
sheet3$Year= as.numeric(format(as.Date(substr(sheet3$enrollment_value,8,9),format="%y"),"%Y"))
sheet3

Creating two data set, county and non-county

sheet_county=sheet3[c(grep(pattern = ", \\w\\w", sheet3$area_name)),]
sheet_not_county=sheet3[-c(grep(pattern = ", \\w\\w", sheet3$area_name)),]
class(sheet_county) = c("county", class(sheet_county))
class(sheet_not_county) = c("state", class(sheet_not_county))

Creating new column of state from county data

sheet_county$state=str_sub(sheet_county$area_name,-2,-1)
sheet_county

Creating division in non-county data

division1=(c("Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island","Vermont"))
division2=c("New Jersey", "New York","Pennsylvania")
division3=c("Illinois", "Indiana", "Michigan","Ohio","Wisconsin")
division4=c("Iowa", "Kansas", "Minnesota","Missouri", "Nebraska","North Dakota","South Dakota")
division5=c("Delaware","Florida","Georgia","Maryland","North Carolina","South Carolina","Virginia"," Washington,D.C.","West Virginia")
division6=c("ALABAMA", "Kentucky", "Mississippi","Tennessee")
division7=c("Arkansas", "Louisiana", "Oklahoma","Texas")
division8=c("Arizona", "Colorado", "Idaho","Montana", "Nevada", "New Mexico","Utah","Wyoming")
division9=c("Alaska", "California", "Hawaii", "Oregon","Washington")
length=nrow(sheet_not_county)
sheet_not_county["division"]=NA
for (i in 1:length){
if (sheet_not_county$area_name[i] %in% toupper(division1)){
  sheet_not_county$division[i]="New England"
} else if (sheet_not_county$area_name[i] %in% toupper(division2)){
  sheet_not_county$division[i]="mid_Atlantic"
}else if (sheet_not_county$area_name[i] %in% toupper(division3)){
  sheet_not_county$division[i]="East North Central"
}else if (sheet_not_county$area_name[i] %in% toupper(division4)){
  sheet_not_county$division[i]="West North Central"
}else if (sheet_not_county$area_name[i] %in% toupper(division5)){
  sheet_not_county$division[i]="South Atlantic"
}else if (sheet_not_county$area_name[i] %in% toupper(division6)){
  sheet_not_county$division[i]="East South Central"
}else if (sheet_not_county$area_name[i] %in% toupper(division7)){
  sheet_not_county$division[i]="West South Central "
}else if (sheet_not_county$area_name[i] %in% toupper(division8)){
  sheet_not_county$division[i]="Mountain"
}else if (sheet_not_county$area_name[i] %in% toupper(division9)){
  sheet_not_county$division[i]="Pacific"
}else {
  sheet_not_county$division[i]="ERROR"
}
}
sheet_not_county

Requirements

Creating individual function from step 1 to 6

part2=read_csv("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv")
function_for_step_1_2= function(part, def_value = "enrollment_value"){
  part_1 = part%>%select(Area_name,STCOU,ends_with("D"))%>%rename("area_name"=Area_name)
  part_2 = part_1%>%pivot_longer(cols = 3:12,names_to = def_value,values_to = "number")
  return(part_2)
}

 function_for_step_3= function(part_2){
  part_2$Year = as.numeric(format(as.Date(substr(part_2$enrollment_value,8,9),format="%y"),"%Y"))
  part_2$Measurements=substr(part_2$enrollment_value,1,7)
 
  return(part_2)
}

func_5 = function(df_county){
  df_county$state=str_sub(df_county$area_name,-2,-1)
  
  return (df_county)
}

func_6 = function(sheet_not_county){
division1=(c("Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island","Vermont"))
division2=c("New Jersey", "New York","Pennsylvania")
division3=c("Illinois", "Indiana", "Michigan","Ohio","Wisconsin")
division4=c("Iowa", "Kansas", "Minnesota","Missouri", "Nebraska","North Dakota","South Dakota")
division5=c("Delaware","Florida","Georgia","Maryland","North Carolina","South Carolina","Virginia"," Washington,D.C.","West Virginia")
division6=c("ALABAMA", "Kentucky", "Mississippi","Tennessee")
division7=c("Arkansas", "Louisiana", "Oklahoma","Texas")
division8=c("Arizona", "Colorado", "Idaho","Montana", "Nevada", "New Mexico","Utah","Wyoming")
division9=c("Alaska", "California", "Hawaii", "Oregon","Washington")
length=nrow(sheet_not_county)
sheet_not_county["division"]<-NA
for (i in 1:length){
if (sheet_not_county$area_name[i] %in% toupper(division1)){
  sheet_not_county$division[i]="New England"
} else if (sheet_not_county$area_name[i] %in% toupper(division2)){
  sheet_not_county$division[i]="mid_Atlantic"
}else if (sheet_not_county$area_name[i] %in% toupper(division3)){
  sheet_not_county$division[i]="East North Central"
}else if (sheet_not_county$area_name[i] %in% toupper(division4)){
  sheet_not_county$division[i]="West North Central"
}else if (sheet_not_county$area_name[i] %in% toupper(division5)){
  sheet_not_county$division[i]="South Atlantic"
}else if (sheet_not_county$area_name[i] %in% toupper(division6)){
  sheet_not_county$division[i]="East South Central"
}else if (sheet_not_county$area_name[i] %in% toupper(division7)){
  sheet_not_county$division[i]="West South Central "
}else if (sheet_not_county$area_name[i] %in% toupper(division8)){
  sheet_not_county$division[i]="Mountain"
}else if (sheet_not_county$area_name[i] %in% toupper(division9)){
  sheet_not_county$division[i]="Pacific"
}else {
  sheet_not_county$division[i]="ERROR"
}
}
return (sheet_not_county)
}

function_for_steps4_5_6= function(part_2){
  df_county = part_2[c(grep(pattern = ", \\w\\w", part_2$area_name)),]
  df_not_county = part_2[-c(grep(pattern = ", \\w\\w", part_2$area_name)),]
  class(df_county) = c("county", class(df_county))
  class(df_not_county) = c("state", class(df_not_county))
  part_5 = func_5(df_county)
  part_6 = func_6(df_not_county)
  return(list(part_5,part_6))

}

Creating Wrapper function

my_wrapper = function(url, def_value = "enrollment_value"){
  a = read_csv(url)
  b = function_for_step_1_2(a,def_value)
  c =  function_for_step_3(b)
  d = function_for_steps4_5_6(c)
  return(list(d[[1]], d[[2]]))
}
my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv")
## [[1]]
## # A tibble: 31,450 × 7
##    area_name   STCOU enrollment_value number  Year Measurements state
##    <chr>       <chr> <chr>             <dbl> <dbl> <chr>        <chr>
##  1 Autauga, AL 01001 EDU010187D         6829  1987 EDU0101      AL   
##  2 Autauga, AL 01001 EDU010188D         6900  1988 EDU0101      AL   
##  3 Autauga, AL 01001 EDU010189D         6920  1989 EDU0101      AL   
##  4 Autauga, AL 01001 EDU010190D         6847  1990 EDU0101      AL   
##  5 Autauga, AL 01001 EDU010191D         7008  1991 EDU0101      AL   
##  6 Autauga, AL 01001 EDU010192D         7137  1992 EDU0101      AL   
##  7 Autauga, AL 01001 EDU010193D         7152  1993 EDU0101      AL   
##  8 Autauga, AL 01001 EDU010194D         7381  1994 EDU0101      AL   
##  9 Autauga, AL 01001 EDU010195D         7568  1995 EDU0101      AL   
## 10 Autauga, AL 01001 EDU010196D         7834  1996 EDU0101      AL   
## # … with 31,440 more rows
## 
## [[2]]
## # A tibble: 530 × 7
##    area_name     STCOU enrollment_value   number  Year Measurements division
##    <chr>         <chr> <chr>               <dbl> <dbl> <chr>        <chr>   
##  1 UNITED STATES 00000 EDU010187D       40024299  1987 EDU0101      ERROR   
##  2 UNITED STATES 00000 EDU010188D       39967624  1988 EDU0101      ERROR   
##  3 UNITED STATES 00000 EDU010189D       40317775  1989 EDU0101      ERROR   
##  4 UNITED STATES 00000 EDU010190D       40737600  1990 EDU0101      ERROR   
##  5 UNITED STATES 00000 EDU010191D       41385442  1991 EDU0101      ERROR   
##  6 UNITED STATES 00000 EDU010192D       42088151  1992 EDU0101      ERROR   
##  7 UNITED STATES 00000 EDU010193D       42724710  1993 EDU0101      ERROR   
##  8 UNITED STATES 00000 EDU010194D       43369917  1994 EDU0101      ERROR   
##  9 UNITED STATES 00000 EDU010195D       43993459  1995 EDU0101      ERROR   
## 10 UNITED STATES 00000 EDU010196D       44715737  1996 EDU0101      ERROR   
## # … with 520 more rows

Call It and Combine Your Data

Creating a Binding function to bind the county and not data of two csv and finally combining it into one dataframe

data_1 = my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv")

data_2 = my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv")

bind_func <- function(data_1, data_2){
  
  first_county <- bind_rows(data_1[[1]], data_2[[1]])
  first_not_county <- bind_rows(data_1[[2]], data_2[[2]])
  
  return(list(first_county, first_not_county))
}
bind_county <- bind_func(data_1, data_2)[[1]]
bind_not_county <- bind_func(data_1, data_2)[[2]]
class(bind_county) = c("county", class(bind_county))
class(bind_not_county) = c("state", class(bind_not_county))
bind_county_ncounty <- function(a,b){
  df <- bind_rows(a,b)
  return(df)
}
bind_county
bind_not_county
#bind_county_ncounty(bind_county, bind_not_county)

Writing a Generic Function for Summarizing

Plotting not county data with mean value of the enrollment across the years for each Division

plot.state = function(bind_not_county,var_name="number"){
  mean_enrollment = bind_not_county %>%
    filter(division != "ERROR") %>%
    group_by(Year, division) %>%
    summarise(Mean = mean(get(var_name),na.rm=TRUE))
  ggplot(mean_enrollment, aes(x =Year, y =Mean, color = division)) + geom_line()+geom_point()
}

Plotting county data allowing user to specify the state of interest, arrangement and count based on enrollment

plot.county = function(bind_county, var_name = "number", State = "AZ", arrangement = "top", count = 5){
  firstfilter_county = bind_county %>%
    filter(state ==State) %>%
    group_by(area_name) %>%
    summarise(MeanVal = mean(get(var_name), na.rm = TRUE))
  if(arrangement == "top"){
    firstfilter_county = head(firstfilter_county[order(-firstfilter_county$MeanVal),],count)
  }else{
    firstfilter_county = head(firstfilter_county[order(firstfilter_county$MeanVal),],count)
  }
  
  
  secondfilter_df=bind_county[bind_county$area_name %in% firstfilter_county$area_name, ]
  ggplot(secondfilter_df, aes(x = Year, y = get(var_name),color = area_name)) + geom_line() + geom_point()
}

Put it all together

data_1 = my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv",)

data_2 = my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv")


bind_county <- bind_func(data_1, data_2)[[1]]
bind_not_county <- bind_func(data_1, data_2)[[2]]



class(bind_county) = c("county", class(bind_county))
class(bind_not_county) = c("state", class(bind_not_county))
plot.state(bind_not_county)

plot.county(bind_county, State = "PA", arrangement = "top", count = 7)

plot.county(bind_county, State = "PA", arrangement = "bottom", count = 4)

plot.county(bind_county)

plot.county(bind_county, State = "MN", arrangement  = "top", count = 10)

first = my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01a.csv" )
second = my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01b.csv")
third = my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01c.csv" )
fourth = my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01d.csv")
FirstSecond = bind_func(first, second)
ThirdFourth = bind_func(third, fourth)
FirsttoFourth = bind_func(FirstSecond, ThirdFourth)
FirsttoFourth
## [[1]]
## # A tibble: 125,800 × 7
##    area_name   STCOU enrollment_value number  Year Measurements state
##    <chr>       <chr> <chr>             <dbl> <dbl> <chr>        <chr>
##  1 Autauga, AL 01001 PST015171D        25508  1971 PST0151      AL   
##  2 Autauga, AL 01001 PST015172D        27166  1972 PST0151      AL   
##  3 Autauga, AL 01001 PST015173D        28463  1973 PST0151      AL   
##  4 Autauga, AL 01001 PST015174D        29266  1974 PST0151      AL   
##  5 Autauga, AL 01001 PST015175D        29718  1975 PST0151      AL   
##  6 Autauga, AL 01001 PST015176D        29896  1976 PST0151      AL   
##  7 Autauga, AL 01001 PST015177D        30462  1977 PST0151      AL   
##  8 Autauga, AL 01001 PST015178D        30882  1978 PST0151      AL   
##  9 Autauga, AL 01001 PST015179D        32055  1979 PST0151      AL   
## 10 Autauga, AL 01001 PST025181D        31985  1981 PST0251      AL   
## # … with 125,790 more rows
## 
## [[2]]
## # A tibble: 2,120 × 7
##    area_name     STCOU enrollment_value    number  Year Measurements division
##    <chr>         <chr> <chr>                <dbl> <dbl> <chr>        <chr>   
##  1 UNITED STATES 00000 PST015171D       206827028  1971 PST0151      ERROR   
##  2 UNITED STATES 00000 PST015172D       209283904  1972 PST0151      ERROR   
##  3 UNITED STATES 00000 PST015173D       211357490  1973 PST0151      ERROR   
##  4 UNITED STATES 00000 PST015174D       213341552  1974 PST0151      ERROR   
##  5 UNITED STATES 00000 PST015175D       215465246  1975 PST0151      ERROR   
##  6 UNITED STATES 00000 PST015176D       217562728  1976 PST0151      ERROR   
##  7 UNITED STATES 00000 PST015177D       219759860  1977 PST0151      ERROR   
##  8 UNITED STATES 00000 PST015178D       222095080  1978 PST0151      ERROR   
##  9 UNITED STATES 00000 PST015179D       224567234  1979 PST0151      ERROR   
## 10 UNITED STATES 00000 PST025181D       229466391  1981 PST0251      ERROR   
## # … with 2,110 more rows
plot.state(FirsttoFourth[[2]])

plot.county(FirsttoFourth[[1]], State = "CT", arrangement = "top", count = 6)

plot.county(FirsttoFourth[[1]], State = "NC", arrangement = "bottom", count = 10)

plot.county(FirsttoFourth[[1]])

plot.county(FirsttoFourth[[1]], State = "MN", arrangement = "top", count = 4)